library(tidyverse)
## -- Attaching packages ------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.6.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Load the track-level table and the artist-level table (with genre tags).
# The printed artist names show the typical "UTF-8 read as Latin-1" garbling,
# so the files are presumably UTF-8; declaring the encoding keeps non-ASCII
# names intact on platforms whose native encoding is not UTF-8.
data <- read.csv("C:/Data/spotify/data.csv", encoding = "UTF-8")
data_w_genres <- read.csv(
  "C:/Data/spotify/data_w_genres.csv",
  encoding = "UTF-8"
)

# Peek at the first rows of the track-level table.
head(data)
##   acousticness                           artists danceability duration_ms
## 1     0.991000                   ['Mamie Smith']        0.598      168333
## 2     0.643000         ["Screamin' Jay Hawkins"]        0.852      150200
## 3     0.993000                   ['Mamie Smith']        0.647      163827
## 4     0.000173               ['Oscar Velazquez']        0.730      422087
## 5     0.295000                          ['Mixe']        0.704      165224
## 6     0.996000 ['Mamie Smith & Her Jazz Hounds']        0.424      198627
##   energy explicit                     id instrumentalness key liveness loudness
## 1  0.224        0 0cS0A1fUEUd1EW3FcF8AEI         5.22e-04   5   0.3790  -12.628
## 2  0.517        0 0hbkKFIJm7Z05H8Zl9w30f         2.64e-02   5   0.0809   -7.261
## 3  0.186        0 11m7laMUgmOKqI3oYzuhne         1.76e-05   0   0.5190  -12.098
## 4  0.798        0 19Lc5SfJJ5O1oaxY0fpwfh         8.01e-01   2   0.1280   -7.311
## 5  0.707        1 2hJjbsLCytGsnAHfdsLejp         2.46e-04  10   0.4020   -6.036
## 6  0.245        0 3HnrHGLE9u2MjHtdobfWl9         7.99e-01   5   0.2350  -11.470
##   mode                                                  name popularity
## 1    0                              Keep A Song In Your Soul         12
## 2    0                                  I Put A Spell On You          7
## 3    1                                          Golfing Papa          4
## 4    1 True House Music - Xavier Santos & Carlos Gomix Remix         17
## 5    0                                             Xuniverxe          2
## 6    1                           Crazy Blues - 78rpm Version          9
##   release_date speechiness   tempo valence year
## 1         1920      0.0936 149.976  0.6340 1920
## 2   1920-01-05      0.0534  86.889  0.9500 1920
## 3         1920      0.1740  97.600  0.6890 1920
## 4   1920-01-01      0.0425 127.997  0.0422 1920
## 5   1920-10-01      0.0768 122.076  0.2990 1920
## 6         1920      0.0397 103.870  0.4770 1920
# First rows of the artist-level table.
data_w_genres %>% head()
##                                                             artists
## 1                                  "Cats" 1981 Original London Cast
## 2                                         "Cats" 1983 Broadway Cast
## 3                     "Fiddler On The Roofâ\200\235 Motion Picture Chorus
## 4                  "Fiddler On The Roofâ\200\235 Motion Picture Orchestra
## 5   "Joseph And The Amazing Technicolor Dreamcoat" 1991 London Cast
## 6 "Joseph And The Amazing Technicolor Dreamcoat" 1992 Canadian Cast
##   acousticness danceability duration_ms    energy instrumentalness  liveness
## 1    0.5985000    0.4701000    267072.0 0.3762030      0.010260876 0.2830500
## 2    0.8625385    0.4417308    287280.0 0.4068077      0.081158264 0.3152154
## 3    0.8565714    0.3482857    328920.0 0.2865714      0.024592949 0.3257857
## 4    0.8849259    0.4250741    262891.0 0.2457704      0.073587279 0.2754815
## 5    0.5107143    0.4671429    270436.1 0.4882857      0.009400291 0.1950000
## 6    0.5911667    0.4843333    218504.5 0.3006083      0.007042273 0.1760667
##    loudness speechiness     tempo   valence popularity key mode count
## 1 -14.43430  0.20915000 114.12880 0.3583200   38.20000   5    1    10
## 2 -10.69000  0.17621154 103.04415 0.2688654   31.53846   5    1    26
## 3 -15.23071  0.11851429  77.37586 0.3548571   34.57143   0    1     7
## 4 -15.63937  0.12320000  88.66763 0.3720296   34.40741   0    1    27
## 5 -10.23671  0.09854286 122.83586 0.4822857   42.00000   5    1     7
## 6 -18.57950  0.10495833 122.51783 0.4245000   33.16667   9    1    24
##           genres
## 1 ['show tunes']
## 2             []
## 3             []
## 4             []
## 5             []
## 6             []
# Pitch-class label for each integer `key` code ("0" = C ... "11" = B).
# The vector is kept reversed as originally defined; the lookup below is by
# name, so the order has no effect on the result.
key_map <- rev(c(
  "0" = "C", "1" = "C#", "2" = "D", "3" = "D#", "4" = "E", "5" = "F",
  "6" = "F#", "7" = "G", "8" = "G#", "9" = "A", "10" = "A#", "11" = "B"
))

# Cleaned track table with derived columns. Despite the name, the filter keeps
# every year up to and including 2020.
data_2010_2020 <- data %>%
  mutate(id = as.character(id)) %>%
  # select(-id, -release_date) %>% # duplicate records for different ids and release date
  filter(year <= 2020) %>%
  distinct() %>%
  # The same song by the same artist can have multiple records with slight
  # changes in audio features and year; keep the record with the smallest id.
  group_by(name, artists) %>%
  filter(id == min(id)) %>%
  # filter(popularity == max(popularity)) %>%
  ungroup() %>%
  mutate(
    popularity_category = ifelse(popularity >= 80, "80+", "<80"),
    # include.lowest = TRUE closes the last bin ([0.9, 1]) so that tracks with
    # valence == 1 are binned instead of becoming NA.
    valence_bin = cut(
      valence,
      seq(0, 1, 0.1),
      right = FALSE,
      include.lowest = TRUE
    ),
    duration_min = duration_ms / (1000 * 60),
    mode_type = case_when(
      mode == 0 ~ "minor",
      mode == 1 ~ "major"
    ),
    key_str = as.character(key),
    # Exact lookup by name: "1" can never be matched inside "10" or "11",
    # which sequential pattern replacement only avoided through ordering.
    key_group = unname(key_map[key_str])
  )
summary(data_2010_2020)
##   acousticness                                    artists        danceability  
##  Min.   :0.0000   ['Tadeusz Dolega Mostowicz']        :  1281   Min.   :0.000  
##  1st Qu.:0.0926   ['ЭÑ\200неÑ\201Ñ‚ ХемингуÑ\215й'] :  1175   1st Qu.:0.413  
##  Median :0.5310   ['ЭÑ\200их МаÑ\200иÑ\217 РемаÑ\200к']:  1062   Median :0.546  
##  Mean   :0.5045   ['Francisco Canaro']                :   918   Mean   :0.535  
##  3rd Qu.:0.8970   ['Frank Sinatra']                   :   586   3rd Qu.:0.667  
##  Max.   :0.9960   ['Ignacio Corsini']                 :   555   Max.   :0.988  
##                   (Other)                             :153004                  
##   duration_ms          energy          explicit           id           
##  Min.   :   4937   Min.   :0.0000   Min.   :0.0000   Length:158581     
##  1st Qu.: 165440   1st Qu.:0.2460   1st Qu.:0.0000   Class :character  
##  Median : 205027   Median :0.4590   Median :0.0000   Mode  :character  
##  Mean   : 232021   Mean   :0.4792   Mean   :0.0701                     
##  3rd Qu.: 265000   3rd Qu.:0.7050   3rd Qu.:0.0000                     
##  Max.   :5338302   Max.   :1.0000   Max.   :1.0000                     
##                                                                        
##  instrumentalness        key            liveness         loudness      
##  Min.   :0.000000   Min.   : 0.000   Min.   :0.0000   Min.   :-60.000  
##  1st Qu.:0.000000   1st Qu.: 2.000   1st Qu.:0.0998   1st Qu.:-14.998  
##  Median :0.000505   Median : 5.000   Median :0.1390   Median :-10.916  
##  Mean   :0.194571   Mean   : 5.204   Mean   :0.2129   Mean   :-11.824  
##  3rd Qu.:0.237000   3rd Qu.: 8.000   3rd Qu.:0.2730   3rd Qu.: -7.572  
##  Max.   :1.000000   Max.   :11.000   Max.   :1.0000   Max.   :  3.855  
##                                                                        
##       mode                      name          popularity     release_date   
##  Min.   :0.000   White Christmas  :    90   Min.   : 0.00   1945   :  1272  
##  1st Qu.:0.000   Winter Wonderland:    77   1st Qu.: 1.00   1935   :  1081  
##  Median :1.000   Silent Night     :    74   Median :26.00   1949   :  1070  
##  Mean   :0.703   Jingle Bells     :    59   Mean   :25.75   1926   :  1010  
##  3rd Qu.:1.000   2000 Years       :    56   3rd Qu.:42.00   1950   :   968  
##  Max.   :1.000   Sleigh Ride      :    46   Max.   :96.00   1948   :   942  
##                  (Other)          :158179                   (Other):152238  
##   speechiness         tempo           valence           year     
##  Min.   :0.0000   Min.   :  0.00   Min.   :0.000   Min.   :1920  
##  1st Qu.:0.0352   1st Qu.: 93.65   1st Qu.:0.312   1st Qu.:1955  
##  Median :0.0456   Median :115.64   Median :0.536   Median :1976  
##  Mean   :0.1092   Mean   :116.88   Mean   :0.525   Mean   :1976  
##  3rd Qu.:0.0773   3rd Qu.:135.00   3rd Qu.:0.743   3rd Qu.:1998  
##  Max.   :0.9710   Max.   :243.51   Max.   :1.000   Max.   :2020  
##                                                                  
##  popularity_category    valence_bin     duration_min       mode_type        
##  Length:158581       [0.5,0.6):19648   Min.   : 0.08228   Length:158581     
##  Class :character    [0.6,0.7):19000   1st Qu.: 2.75733   Class :character  
##  Mode  :character    [0.7,0.8):17914   Median : 3.41712   Mode  :character  
##                      [0.4,0.5):17264   Mean   : 3.86701                     
##                      [0.3,0.4):17071   3rd Qu.: 4.41667                     
##                      (Other)  :67680   Max.   :88.97170                     
##                      NA's     :    4                                        
##    key_str           key_group        
##  Length:158581      Length:158581     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
# Number of tracks per year, drawn as a line with point markers.
ggplot(
  data = count(data_2010_2020, year),
  mapping = aes(x = year, y = n, group = 1)
) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 1910, to = 2020, by = 10)) +
  theme(axis.text.x = element_text(angle = 90))

# Yearly mean (blue) and maximum (green) track popularity.
ggplot(
  data = summarise(
    group_by(data_2010_2020, year),
    mean_popularity = mean(popularity),
    max_popularity = max(popularity)
  ),
  mapping = aes(x = year, group = 1)
) +
  geom_line(mapping = aes(y = mean_popularity), colour = "blue") +
  geom_line(mapping = aes(y = max_popularity), colour = "green") +
  scale_x_continuous(breaks = seq(from = 1910, to = 2020, by = 10)) +
  theme(axis.text.x = element_text(angle = 90))

# Columns treated as audio features in the per-track profile below.
audio_features <- c(
  "acousticness", "danceability", "duration_min", "energy",
  "instrumentalness", "explicit", "liveness", "loudness", "key", "mode",
  "speechiness", "tempo", "valence"
)

# Feature profile of the most popular track(s): one facet per feature, one
# bar per track name, each facet on its own y scale.
data_2010_2020 %>%
  filter(popularity == max(popularity)) %>%
  # all_of() makes explicit that `audio_features` is an external character
  # vector of column names, not a column of the data frame.
  pivot_longer(
    cols = all_of(audio_features),
    names_to = "feature_name",
    values_to = "feature_value"
  ) %>%
  ggplot(aes(x = name, y = feature_value, fill = name)) +
  geom_col() +
  facet_wrap(~feature_name, scales = "free_y")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(audio_features)` instead of `audio_features` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

# Min-max rescale a numeric vector to [0, 1].
#
# x      Numeric (or logical) vector.
# na.rm  If TRUE (default), NAs are ignored when computing the range and stay
#        NA in the result; if FALSE, any NA makes the whole result NA.
#
# Returns a double vector of the same length as `x`. Empty and all-NA inputs
# are returned unchanged (as double); a constant vector maps to 0 rather than
# NaN, because its range is zero.
rescale <- function(x, na.rm = TRUE) {
  stopifnot(is.numeric(x) || is.logical(x))
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # Zero range: avoid 0 / 0 = NaN.
  if (!is.na(span) && span == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }
  (x - rng[1]) / span
}
# Copy of the track table with every numeric column passed through rescale().
# `year` is temporarily converted to character so the numeric predicate skips
# it, then restored to integer.
scales_data_2010_2020 <- data_2010_2020 %>%
  mutate(year = as.character(year)) %>%
  mutate_if(is.numeric, ~rescale(.)) %>%
  mutate(year = as.integer(year))

# Continuous audio features compared on the common rescaled axis.
audio_features_2 <- c(
  "acousticness", "danceability", "energy", "instrumentalness", "liveness",
  "loudness", "speechiness", "tempo", "valence"
)

# Yearly mean of each rescaled audio feature, one coloured line per feature.
scales_data_2010_2020 %>%
  # all_of() marks `audio_features_2` as an external vector of column names.
  pivot_longer(
    cols = all_of(audio_features_2),
    names_to = "feature_name",
    values_to = "feature_value"
  ) %>%
  group_by(year, feature_name) %>%
  summarise(mean_feature_value = mean(feature_value)) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = mean_feature_value, color = feature_name)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(1910, 2020, 10)) +
  scale_color_brewer(palette = "Set3")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(audio_features_2)` instead of `audio_features_2` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

# Major vs. minor mode

# Stacked yearly track counts, filled by mode (major / minor).
ggplot(
  data = count(data_2010_2020, year, mode_type),
  mapping = aes(x = year, y = n, fill = mode_type, group = 1)
) +
  geom_col() +
  scale_x_continuous(breaks = seq(from = 1910, to = 2020, by = 10))

# Explicit tracks

# Stacked yearly track counts, filled by the explicit flag.
ggplot(
  data = count(data_2010_2020, year, explicit),
  mapping = aes(x = year, y = n, fill = factor(explicit))
) +
  geom_col() +
  scale_x_continuous(breaks = seq(from = 1910, to = 2020, by = 10))

# Key

# Share of each musical key within every year, as stacked bars.
data_2010_2020 %>%
  count(year, key_group) %>%
  group_by(year) %>%
  # prop.table(n) is n / sum(n), computed within each year.
  mutate(perc = prop.table(n)) %>%
  ggplot(mapping = aes(x = year, y = perc, fill = key_group)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_continuous(breaks = seq(from = 1910, to = 2020, by = 10))

# Duration

# Mean track duration (minutes) per year.
data_2010_2020 %>%
  group_by(year) %>%
  # summarise() (rather than mutate()) yields one row per year, so each yearly
  # mean is drawn once instead of once per track.
  summarise(mean_dur = mean(duration_min)) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = mean_dur)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(1910, 2020, 10))

# Track(s) with the highest danceability.
data_2010_2020 %>%
  slice(which(danceability == max(danceability)))
## # A tibble: 1 x 25
##   acousticness artists danceability duration_ms energy explicit id   
##          <dbl> <fct>          <dbl>       <int>  <dbl>    <int> <chr>
## 1       0.0755 ['Tone~        0.988      248160  0.633        0 5YIF~
## # ... with 18 more variables: instrumentalness <dbl>, key <int>,
## #   liveness <dbl>, loudness <dbl>, mode <int>, name <fct>, popularity <int>,
## #   release_date <fct>, speechiness <dbl>, tempo <dbl>, valence <dbl>,
## #   year <int>, popularity_category <chr>, valence_bin <fct>,
## #   duration_min <dbl>, mode_type <chr>, key_str <chr>, key_group <chr>
# Track(s) with the highest tempo.
data_2010_2020 %>%
  slice(which(tempo == max(tempo)))
## # A tibble: 1 x 25
##   acousticness artists danceability duration_ms energy explicit id   
##          <dbl> <fct>          <dbl>       <int>  <dbl>    <int> <chr>
## 1        0.497 ['Bill~        0.535      277221   0.38        0 56n7~
## # ... with 18 more variables: instrumentalness <dbl>, key <int>,
## #   liveness <dbl>, loudness <dbl>, mode <int>, name <fct>, popularity <int>,
## #   release_date <fct>, speechiness <dbl>, tempo <dbl>, valence <dbl>,
## #   year <int>, popularity_category <chr>, valence_bin <fct>,
## #   duration_min <dbl>, mode_type <chr>, key_str <chr>, key_group <chr>
# Interactive heatmap of the pairwise correlations (cor() defaults) between
# the continuous audio features and popularity.
ggplotly(
  data_2010_2020 %>%
    # all_of() marks the names as an external character vector.
    select(all_of(c(audio_features_2, "popularity"))) %>%
    cor() %>%
    # Long format: one row per (Var1, Var2) cell of the correlation matrix.
    melt() %>%
    ggplot(aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() +
    scale_fill_gradient2(low = "blue", high = "orange", mid = "purple") +
    theme(axis.text.x = element_text(angle = 90))
)
# Mean popularity against mean acousticness within fine acousticness bins
# (width 0.0001), with a linear fit.
data_2010_2020 %>%
  group_by(
    acoustic_bin = cut(
      acousticness,
      breaks = seq(from = 0, to = 1.1, by = 0.0001),
      right = FALSE
    )
  ) %>%
  summarise(
    mean_popularity = mean(popularity),
    mean_acoustic = mean(acousticness)
  ) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = mean_acoustic, y = mean_popularity)) +
  geom_point(alpha = 0.2, size = 3) +
  geom_smooth(method = "lm")

  #geom_point(alpha = 0.8, size = 3, color = "red", pch = 21, fill = "black")
# Mean popularity against mean energy within fine energy bins (width 0.0001),
# with a linear fit.
data_2010_2020 %>%
  group_by(
    energy_bin = cut(
      energy,
      breaks = seq(from = 0, to = 1.1, by = 0.0001),
      right = FALSE
    )
  ) %>%
  summarise(
    mean_popularity = mean(popularity),
    mean_energy = mean(energy)
  ) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = mean_energy, y = mean_popularity)) +
  geom_point(alpha = 0.2, size = 3) +
  geom_smooth(method = "lm")

  #geom_point(alpha = 0.8, size = 3, color = "red", pch = 21, fill = "black")
  
  # Smallest energy value in the data.
  min(pull(data_2010_2020, energy))
## [1] 0
# Mean popularity against mean loudness within fine loudness bins (width 0.01),
# with a linear fit.
data_2010_2020 %>%
  mutate(
    # Derive the breaks from the observed range instead of a fixed [-60, 0):
    # loudness can be positive, and out-of-range values would otherwise all
    # fall into a single NA bin. The extra step past the maximum keeps the
    # largest value inside the last half-open interval.
    loudness_bin = cut(
      loudness,
      seq(
        floor(min(loudness, na.rm = TRUE)),
        ceiling(max(loudness, na.rm = TRUE)) + 0.01,
        by = 0.01
      ),
      right = FALSE
    )
  ) %>%
  group_by(loudness_bin) %>%
  summarise(
    mean_popularity = mean(popularity),
    mean_loudness = mean(loudness)
  ) %>%
  ungroup() %>%
  ggplot(aes(x = mean_loudness, y = mean_popularity)) +
  geom_point(alpha = 0.2, size = 3) +
  geom_smooth(method = "lm")
## Warning: Factor `loudness_bin` contains implicit NA, consider using
## `forcats::fct_explicit_na`

  #geom_point(alpha = 0.8, size = 3, color = "red", pch = 21, fill = "black")
  
  # Smallest loudness value in the data.
  min(pull(data_2010_2020, loudness))
## [1] -60
# Mean popularity against mean instrumentalness within fine bins
# (width 0.0001), with a linear fit.
data_2010_2020 %>%
  mutate(
    # include.lowest = TRUE closes the last interval at 1, so tracks with
    # instrumentalness == 1 are binned instead of becoming NA.
    instr_bin = cut(
      instrumentalness,
      seq(0, 1, 0.0001),
      right = FALSE,
      include.lowest = TRUE
    )
  ) %>%
  group_by(instr_bin) %>%
  summarise(
    mean_popularity = mean(popularity),
    mean_instr = mean(instrumentalness)
  ) %>%
  ungroup() %>%
  ggplot(aes(x = mean_instr, y = mean_popularity)) +
  geom_point(alpha = 0.2, size = 3) +
  geom_smooth(method = "lm")
## Warning: Factor `instr_bin` contains implicit NA, consider using
## `forcats::fct_explicit_na`

  #geom_point(alpha = 0.8, size = 3, color = "red", pch = 21, fill = "black")
  
  # Smallest loudness value in the data.
  # NOTE(review): this repeats the loudness check right after the
  # instrumentalness plot; possibly meant to inspect instrumentalness. Confirm.
  min(data_2010_2020[["loudness"]])
## [1] -60
# Mean valence against mean danceability within fine danceability bins
# (width 0.0001), with a linear fit.
data_2010_2020 %>%
  group_by(
    dance_bin = cut(
      danceability,
      breaks = seq(from = 0, to = 1, by = 0.0001),
      right = FALSE
    )
  ) %>%
  summarise(
    mean_valence = mean(valence),
    mean_dance = mean(danceability)
  ) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = mean_dance, y = mean_valence)) +
  geom_point(alpha = 0.2, size = 3) +
  geom_smooth(method = "lm")

  #geom_point(alpha = 0.8, size = 3, color = "red", pch = 21, fill = "black")
# The 20 artist credits with the most tracks, as horizontal bars coloured by
# the span (in years) between their first and last track.
data_2010_2020 %>%
  group_by(artists) %>%
  summarise(
    n_songs = n(),
    first_activity = min(year),
    last_activity = max(year)
  ) %>%
  ungroup() %>%
  mutate(years_active = 1 + last_activity - first_activity) %>%
  arrange(desc(n_songs)) %>%
  head(n = 20) %>%
  ggplot(
    mapping = aes(
      x = reorder(artists, n_songs),
      y = n_songs,
      fill = years_active
    )
  ) +
  geom_col() +
  coord_flip()

# Coarse genre labels of interest (not referenced by the code below).
genre_group <- c("pop", "indie", "rock", "metal", "rap", "jazz", "classical")

# Add a `genre_group` column to an artist-level table that has a `genres`
# column. Each row gets the first label whose text occurs anywhere in
# `genres`, checked in the order listed; rows matching none get "other".
add_genre_group <- function(df) {
  df %>%
    mutate(
      genre_group = case_when(
        grepl("pop", genres) ~ "pop",
        grepl("indie", genres) ~ "indie",
        grepl("rock", genres) ~ "rock",
        grepl("metal", genres) ~ "metal",
        grepl("rap", genres) ~ "rap",
        grepl("jazz", genres) ~ "jazz",
        grepl("classical", genres) ~ "classical",
        grepl("tango", genres) ~ "tango",
        TRUE ~ "other"
      )
    )
}

# Top 30 artists by track count; bar length = track count.
data_w_genres %>%
  add_genre_group() %>%
  arrange(desc(count)) %>%
  head(30) %>%
  ggplot(aes(x = reorder(artists, count), y = count, fill = genre_group)) +
  geom_col() +
  coord_flip()

# Same top 30 artists by track count; bar length = popularity.
data_w_genres %>%
  add_genre_group() %>%
  arrange(desc(count)) %>%
  head(30) %>%
  ggplot(aes(x = reorder(artists, count), y = popularity, fill = genre_group)) +
  geom_col() +
  coord_flip()

# Top 30 artists by popularity; bar length = popularity.
data_w_genres %>%
  add_genre_group() %>%
  arrange(desc(popularity)) %>%
  head(30) %>%
  ggplot(
    aes(x = reorder(artists, popularity), y = popularity, fill = genre_group)
  ) +
  geom_col() +
  coord_flip()

# Scatter of artist popularity against artist track count.
# Disabled binning experiment, kept for reference:
  #mutate(dance_bin = cut(danceability, seq(0,1,0.0001), right = FALSE)) %>%
  #group_by(dance_bin) %>%
  #summarise(mean_valence = mean(valence),
  #       mean_dance = mean(danceability)) %>%
  #ungroup() %>%
ggplot(data = data_w_genres, mapping = aes(x = count, y = popularity)) +
  geom_point(alpha = 0.2, size = 3)

# Correlation between artist track count and artist popularity.
with(data_w_genres, cor(count, popularity))
## [1] -0.000227884
# Tracks credited solely to Lata Mangeshkar, most popular first.
data_2010_2020 %>%
  filter(artists %in% "['Lata Mangeshkar']") %>%
  arrange(desc(popularity))
## # A tibble: 323 x 25
##    acousticness artists danceability duration_ms energy explicit id   
##           <dbl> <fct>          <dbl>       <int>  <dbl>    <int> <chr>
##  1        0.983 ['Lata~        0.433      238267  0.497        0 2DG0~
##  2        0.904 ['Lata~        0.447      353653  0.427        0 0mMs~
##  3        0.788 ['Lata~        0.389      354427  0.372        0 5M7L~
##  4        0.308 ['Lata~        0.611      220107  0.293        0 5nqD~
##  5        0.875 ['Lata~        0.566      315613  0.412        0 1O5q~
##  6        0.988 ['Lata~        0.731      188560  0.314        0 4CHM~
##  7        0.907 ['Lata~        0.435      509466  0.449        0 167v~
##  8        0.988 ['Lata~        0.425      286297  0.449        0 5y1l~
##  9        0.974 ['Lata~        0.551      242507  0.323        0 2QT3~
## 10        0.962 ['Lata~        0.721      153453  0.293        0 5PTS~
## # ... with 313 more rows, and 18 more variables: instrumentalness <dbl>,
## #   key <int>, liveness <dbl>, loudness <dbl>, mode <int>, name <fct>,
## #   popularity <int>, release_date <fct>, speechiness <dbl>, tempo <dbl>,
## #   valence <dbl>, year <int>, popularity_category <chr>, valence_bin <fct>,
## #   duration_min <dbl>, mode_type <chr>, key_str <chr>, key_group <chr>
# Spread of each rescaled audio feature across Lata Mangeshkar's tracks.
scales_data_2010_2020 %>%
  filter(artists == "['Lata Mangeshkar']") %>%
  # all_of() marks `audio_features_2` as an external vector of column names.
  pivot_longer(
    all_of(audio_features_2),
    names_to = "feature_name",
    values_to = "feature_value"
  ) %>%
  ggplot(aes(x = feature_name, y = feature_value, color = feature_name)) +
  geom_jitter(size = 3, alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90))

# Tracks credited solely to The Beatles, most popular first.
data_2010_2020 %>%
  filter(artists %in% "['The Beatles']") %>%
  arrange(desc(popularity))
## # A tibble: 327 x 25
##    acousticness artists danceability duration_ms energy explicit id   
##           <dbl> <fct>          <dbl>       <int>  <dbl>    <int> <chr>
##  1       0.0302 ['The ~        0.533      259947  0.376        0 2Eql~
##  2       0.879  ['The ~        0.332      125667  0.179        0 3BQH~
##  3       0.0112 ['The ~        0.386      425653  0.607        0 0aym~
##  4       0.754  ['The ~        0.686      138387  0.127        0 5jgF~
##  5       0.641  ['The ~        0.482      155227  0.849        0 5ZBe~
##  6       0.386  ['The ~        0.49       145747  0.715        0 4pbG~
##  7       0.198  ['The ~        0.396      182293  0.338        0 0pNe~
##  8       0.336  ['The ~        0.39       247320  0.502        0 3Am0~
##  9       0.232  ['The ~        0.818      188960  0.728        0 1gFN~
## 10       0.0205 ['The ~        0.453      285000  0.654        0 389Q~
## # ... with 317 more rows, and 18 more variables: instrumentalness <dbl>,
## #   key <int>, liveness <dbl>, loudness <dbl>, mode <int>, name <fct>,
## #   popularity <int>, release_date <fct>, speechiness <dbl>, tempo <dbl>,
## #   valence <dbl>, year <int>, popularity_category <chr>, valence_bin <fct>,
## #   duration_min <dbl>, mode_type <chr>, key_str <chr>, key_group <chr>
# Spread of each rescaled audio feature across The Beatles' tracks.
scales_data_2010_2020 %>%
  filter(artists == "['The Beatles']") %>%
  # all_of() marks `audio_features_2` as an external vector of column names.
  pivot_longer(
    all_of(audio_features_2),
    names_to = "feature_name",
    values_to = "feature_value"
  ) %>%
  ggplot(aes(x = feature_name, y = feature_value, color = feature_name)) +
  geom_jitter(size = 3, alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90))

# The Beatles by year: mean popularity (blue) and number of tracks (green).
data_2010_2020 %>%
  filter(artists %in% "['The Beatles']") %>%
  group_by(year) %>%
  summarise(
    mean_popularity = mean(popularity),
    total_popularity = sum(popularity),
    n_songs = n()
  ) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = year)) +
  geom_line(mapping = aes(y = mean_popularity), colour = "blue") +
  geom_line(mapping = aes(y = n_songs), colour = "green") +
  scale_x_continuous(breaks = seq(from = 1910, to = 2020, by = 10))

# Tracks credited solely to Queen, most popular first.
data_2010_2020 %>%
  filter(artists %in% "['Queen']") %>%
  arrange(desc(popularity))
## # A tibble: 320 x 25
##    acousticness artists danceability duration_ms energy explicit id   
##           <dbl> <fct>          <dbl>       <int>  <dbl>    <int> <chr>
##  1       0.0472 ['Quee~        0.563      209413  0.865        0 7hQJ~
##  2       0.112  ['Quee~        0.933      214653  0.528        0 57JV~
##  3       0.679  ['Quee~        0.693      122067  0.497        0 54fl~
##  4       0.714  ['Quee~        0.599      163373  0.762        0 35It~
##  5       0.215  ['Quee~        0.304      261627  0.42         0 3Aym~
##  6       0.0319 ['Quee~        0.337      255600  0.684        0 4igI~
##  7       0.378  ['Quee~        0.268      179200  0.459        0 7ccI~
##  8       0.566  ['Quee~        0.545      173173  0.454        0 1mnQ~
##  9       0.057  ['Quee~        0.557      209600  0.761        0 3lrN~
## 10       0.414  ['Quee~        0.306      277827  0.686        0 3hU6~
## # ... with 310 more rows, and 18 more variables: instrumentalness <dbl>,
## #   key <int>, liveness <dbl>, loudness <dbl>, mode <int>, name <fct>,
## #   popularity <int>, release_date <fct>, speechiness <dbl>, tempo <dbl>,
## #   valence <dbl>, year <int>, popularity_category <chr>, valence_bin <fct>,
## #   duration_min <dbl>, mode_type <chr>, key_str <chr>, key_group <chr>
# Spread of each rescaled audio feature across Queen's tracks.
scales_data_2010_2020 %>%
  filter(artists == "['Queen']") %>%
  # all_of() marks `audio_features_2` as an external vector of column names.
  pivot_longer(
    all_of(audio_features_2),
    names_to = "feature_name",
    values_to = "feature_value"
  ) %>%
  ggplot(aes(x = feature_name, y = feature_value, color = feature_name)) +
  geom_jitter(size = 3, alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90))

# Queen: mean track popularity per year.
data_2010_2020 %>%
  filter(artists %in% "['Queen']") %>%
  group_by(year) %>%
  summarise(
    mean_popularity = mean(popularity),
    total_popularity = sum(popularity)
  ) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = year, y = mean_popularity)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 1910, to = 2020, by = 10))

# Tracks credited solely to Coldplay, most popular first.
data_2010_2020 %>%
  filter(artists %in% "['Coldplay']") %>%
  arrange(desc(popularity))
## # A tibble: 49 x 25
##    acousticness artists danceability duration_ms energy explicit id   
##           <dbl> <fct>          <dbl>       <int>  <dbl>    <int> <chr>
##  1      0.00239 ['Cold~        0.429      266773  0.661        0 3AJw~
##  2      0.164   ['Cold~        0.209      295533  0.417        0 7LVH~
##  3      0.0954  ['Cold~        0.486      242373  0.617        0 1mea~
##  4      0.0509  ['Cold~        0.449      278719  0.585        0 6nek~
##  5      0.131   ['Cold~        0.312      242496  0.418        0 4fzy~
##  6      0.211   ['Cold~        0.491      258267  0.693        0 3RiP~
##  7      0.00617 ['Cold~        0.545      267867  0.675        0 0FDz~
##  8      0.599   ['Cold~        0.577      307880  0.749        0 0BCP~
##  9      0.00205 ['Cold~        0.638      263787  0.924        0 69ux~
## 10      0.748   ['Cold~        0.371      227093  0.268        0 7D0R~
## # ... with 39 more rows, and 18 more variables: instrumentalness <dbl>,
## #   key <int>, liveness <dbl>, loudness <dbl>, mode <int>, name <fct>,
## #   popularity <int>, release_date <fct>, speechiness <dbl>, tempo <dbl>,
## #   valence <dbl>, year <int>, popularity_category <chr>, valence_bin <fct>,
## #   duration_min <dbl>, mode_type <chr>, key_str <chr>, key_group <chr>
# Spread of each rescaled audio feature across Coldplay's tracks.
scales_data_2010_2020 %>%
  filter(artists == "['Coldplay']") %>%
  # all_of() marks `audio_features_2` as an external vector of column names.
  pivot_longer(
    all_of(audio_features_2),
    names_to = "feature_name",
    values_to = "feature_value"
  ) %>%
  ggplot(aes(x = feature_name, y = feature_value, color = feature_name)) +
  geom_jitter(size = 3, alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90))

# Coldplay: mean track popularity per year, each point labelled with its year.
data_2010_2020 %>%
  filter(artists %in% "['Coldplay']") %>%
  group_by(year) %>%
  summarise(
    mean_popularity = mean(popularity),
    total_popularity = sum(popularity)
  ) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = year, y = mean_popularity, label = year)) +
  geom_line() +
  geom_point() +
  geom_text() +
  scale_x_continuous(breaks = seq(from = 1910, to = 2020, by = 10))